Chip 2007 January, February, March & April

home *** CD-ROM | disk | FTP | other *** search

/ Chip 2007 January, February, March & April / Chip-Cover-CD-2007-02.iso / Pakiet bezpieczenstwa / mini Pentoo LiveCD 2006.1 / mpentoo-2006.1.iso / livecd.squashfs / usr / lib / python2.4 / robotparser.pyo (.txt) < prev next >

Wrap

Python Compiled Bytecode | 2005-10-18 | 9KB | 313 lines

# Source Generated with Decompyle++
# File: in.pyo (Python 2.4)
"""robotparser.py

    Copyright (C) 2000  Bastian Kleineidam

    You can choose between two licenses when using this package:
    1) GNU GPLv2
    2) PSF license for Python 2.2

    The robots.txt Exclusion Protocol is implemented as specified in
    http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
"""

# NOTE: only syntax that Python 2.4 accepts is used in this file (no "with",
# no conditional expressions, no "except ... as"), so the module keeps
# loading on the interpreter it was decompiled from as well as on Python 3.
import sys

try:
    # Python 2: the modules this file was originally written against.
    import urlparse
    import urllib
    _urlparse = urlparse.urlparse
    _quote = urllib.quote
    _unquote = urllib.unquote
    _OpenerBase = urllib.FancyURLopener
except ImportError:
    # Python 3: the same helpers live in urllib.parse.  FancyURLopener is
    # deprecated and absent from recent releases, so a small urlopen-based
    # stand-in provides the two methods URLopener relies on.
    from urllib.parse import urlparse as _urlparse
    from urllib.parse import quote as _quote
    from urllib.parse import unquote as _unquote
    from urllib.request import urlopen as _urlopen
    from urllib.error import HTTPError as _HTTPError

    class _OpenerBase(object):
        """Minimal opener exposing open() and http_error_default()."""

        def __init__(self, *args):
            pass

        def open(self, url):
            """Return a file-like response; HTTP errors go to the hook."""
            try:
                return _urlopen(url)
            except _HTTPError:
                # sys.exc_info() instead of "except ... as" keeps this file
                # parseable by Python 2.4.
                err = sys.exc_info()[1]
                return self.http_error_default(
                    url, err, err.code, err.msg, err.headers)

        def http_error_default(self, url, fp, errcode, errmsg, headers):
            """Hand back the error response so its body can still be read."""
            return fp

__all__ = ['RobotFileParser']

# Set to a true value to print parser diagnostics to stdout.
debug = 0

# States of the line-oriented state machine in RobotFileParser.parse().
_EXPECT_AGENT = 0   # no record open: waiting for a "User-agent:" line
_AGENT_SEEN = 1     # saw user-agent line(s) but no rule yet
_RULE_SEEN = 2      # record has at least one allow/disallow rule


def _debug(msg):
    """Print msg when the module-level debug flag is set."""
    if debug:
        print(msg)


class RobotFileParser:
    """Read, parse and answer questions about a single robots.txt file."""

    def __init__(self, url=''):
        self.entries = []            # records for specifically named agents
        self.default_entry = None    # the "User-agent: *" record, if any
        self.disallow_all = False
        self.allow_all = False
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Return the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to check
        for new robots.txt files periodically.
        """
        return self.last_checked

    def modified(self):
        """Set the time the robots.txt file was last fetched to now."""
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Set the URL referring to a robots.txt file."""
        self.url = url
        self.host, self.path = _urlparse(url)[1:3]

    def read(self):
        """Read the robots.txt URL and feed it to the parser.

        A 401 or 403 response disallows everything, any other status
        >= 400 allows everything, and a 200 response with content is
        handed to parse().
        """
        opener = URLopener()
        f = opener.open(self.url)
        lines = []
        try:
            raw = f.readline()
            while raw:
                # Python 3 responses yield bytes; parse() works on text.
                if not isinstance(raw, str):
                    raw = raw.decode('utf-8')
                lines.append(raw.strip())
                raw = f.readline()
        finally:
            # The response was previously never closed.
            f.close()
        self.errcode = opener.errcode
        if self.errcode == 401 or self.errcode == 403:
            self.disallow_all = True
            _debug('disallow all')
        elif self.errcode >= 400:
            self.allow_all = True
            _debug('allow all')
        elif self.errcode == 200 and lines:
            _debug('parse lines')
            self.parse(lines)

    def _add_entry(self, entry):
        """File entry as the default ("*") record or as a named one."""
        if '*' in entry.useragents:
            # the default entry is considered last
            self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.

        lines is an iterable of already-stripped text lines.  A
        user-agent: line does not have to be preceded by one or more
        blank lines.
        """
        state = _EXPECT_AGENT
        linenumber = 0
        entry = Entry()

        for line in lines:
            linenumber = linenumber + 1
            if not line:
                # A blank line terminates the current record.
                if state == _AGENT_SEEN:
                    _debug('line %d: warning: you should insert allow: or disallow: directives below any user-agent: line' % linenumber)
                    entry = Entry()
                    state = _EXPECT_AGENT
                elif state == _RULE_SEEN:
                    self._add_entry(entry)
                    entry = Entry()
                    state = _EXPECT_AGENT
                continue

            # remove optional comment and strip line
            comment = line.find('#')
            if comment >= 0:
                line = line[:comment]
            line = line.strip()
            if not line:
                continue

            parts = line.split(':', 1)
            if len(parts) != 2:
                # Only lines without a "key: value" shape are malformed.
                _debug('line %d: error: malformed line %s' % (linenumber, parts))
                continue

            key = parts[0].strip().lower()
            value = _unquote(parts[1].strip())
            if key == 'user-agent':
                if state == _RULE_SEEN:
                    _debug('line %d: warning: you should insert a blank line before any user-agent directive' % linenumber)
                    self._add_entry(entry)
                    entry = Entry()
                entry.useragents.append(value)
                state = _AGENT_SEEN
            elif key == 'disallow':
                if state == _EXPECT_AGENT:
                    _debug('line %d: error: you must insert a user-agent: directive before this line' % linenumber)
                else:
                    entry.rulelines.append(RuleLine(value, False))
                    state = _RULE_SEEN
            elif key == 'allow':
                if state == _EXPECT_AGENT:
                    _debug('line %d: error: you must insert a user-agent: directive before this line' % linenumber)
                else:
                    entry.rulelines.append(RuleLine(value, True))
                    # An Allow line is a rule too; without this a record
                    # holding only Allow lines was silently dropped.
                    state = _RULE_SEEN
            else:
                _debug('line %d: warning: unknown key %s' % (linenumber, key))

        if state == _RULE_SEEN:
            # Go through _add_entry so a trailing "User-agent: *" record
            # becomes the default entry like any other.
            self._add_entry(entry)
        _debug('Parsed rules:\n%s' % str(self))

    def can_fetch(self, useragent, url):
        """Using the parsed robots.txt decide if useragent can fetch url."""
        _debug('Checking robots.txt allowance for:\n user agent: %s\n url: %s' % (useragent, url))
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # Rules are matched against the normalised path only; an empty
        # path means the site root.
        url = _quote(_urlparse(_unquote(url))[2]) or '/'
        # search for given user agent matches; the first match counts
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True

    def __str__(self):
        """Return the named-agent records in robots.txt syntax."""
        return ''.join([str(entry) + '\n' for entry in self.entries])


class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
    (allowance==False) followed by a path."""

    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        self.path = _quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        """Return true if this rule covers filename (a quoted URL path)."""
        return self.path == '*' or filename.startswith(self.path)

    def __str__(self):
        if self.allowance:
            kind = 'Allow'
        else:
            kind = 'Disallow'
        return kind + ': ' + self.path


class Entry:
    """An entry has one or more user-agents and zero or more rulelines."""

    def __init__(self):
        self.useragents = []
        self.rulelines = []

    def __str__(self):
        """Return the record in robots.txt syntax."""
        parts = ['User-agent: ' + agent + '\n' for agent in self.useragents]
        parts.extend([str(line) + '\n' for line in self.rulelines])
        return ''.join(parts)

    def applies_to(self, useragent):
        """Check if this entry applies to the specified agent."""
        # split the name token and make it lower case
        useragent = useragent.split('/')[0].lower()
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return True
            if agent.lower() in useragent:
                return True
        return False

    def allowance(self, filename):
        """Return the allowance of the first rule matching filename.

        Preconditions:
        - our agent applies to this entry
        - filename is URL decoded

        Returns True when no rule matches.
        """
        for line in self.rulelines:
            _debug((filename, str(line), line.allowance))
            if line.applies_to(filename):
                return line.allowance
        return True


class URLopener(_OpenerBase):
    """Opener that records the HTTP status code of a failed request."""

    def __init__(self, *args):
        _OpenerBase.__init__(self, *args)
        self.errcode = 200

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Remember errcode, then defer to the base class handling."""
        self.errcode = errcode
        return _OpenerBase.http_error_default(self, url, fp, errcode,
                                              errmsg, headers)


def _check(a, b):
    """Print whether result a matches expectation b (self-test helper)."""
    if not b:
        ac = 'access denied'
    else:
        ac = 'access allowed'
    if a != b:
        print('failed')
    else:
        print('ok (%s)' % ac)
    print('')


def _test():
    """Run the live self-test; needs network access to the sites below."""
    global debug
    rp = RobotFileParser()
    debug = 1

    # robots.txt that exists, gotten to by redirection
    rp.set_url('http://www.musi-cal.com/robots.txt')
    rp.read()

    # test for re.escape
    _check(rp.can_fetch('*', 'http://www.musi-cal.com/'), 1)
    # this should match the first rule, which is a disallow
    _check(rp.can_fetch('', 'http://www.musi-cal.com/'), 0)
    # various cherry pickers
    _check(rp.can_fetch('CherryPickerSE', 'http://www.musi-cal.com/cgi-bin/event-search?city=San+Francisco'), 0)
    _check(rp.can_fetch('CherryPickerSE/1.0', 'http://www.musi-cal.com/cgi-bin/event-search?city=San+Francisco'), 0)
    _check(rp.can_fetch('CherryPickerSE/1.5', 'http://www.musi-cal.com/cgi-bin/event-search?city=San+Francisco'), 0)
    # case sensitivity
    _check(rp.can_fetch('ExtractorPro', 'http://www.musi-cal.com/blubba'), 0)
    _check(rp.can_fetch('extractorpro', 'http://www.musi-cal.com/blubba'), 0)
    # substring test
    _check(rp.can_fetch('toolpak/1.1', 'http://www.musi-cal.com/blubba'), 0)
    # tests for catch-all * agent
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/search'), 0)
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/Musician/me'), 1)
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)

    # robots.txt that does not exist
    rp.set_url('http://www.lycos.com/robots.txt')
    rp.read()
    _check(rp.can_fetch('Mozilla', 'http://www.lycos.com/search'), 1)


if __name__ == '__main__':
    _test()